/* Copyright 2010 Speech and Language Technologies Lab, The Ohio State University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.osu.slate.relatedness.swwr.setup.wordmapping;

import java.io.*;
import java.util.*;

import edu.osu.slate.relatedness.Configuration;
import edu.osu.slate.relatedness.swwr.data.graph.IDIDRedirect;
import edu.osu.slate.relatedness.swwr.data.graph.IDVertexTranslation;

/**
 * Creates a simple word-to-vertex mapping for the Wiki graph using the Page title.
 * 
 * Requires initialized {@link IDVertexTranslation} and {@link IDIDRedirect} classes.  All output text is lowercase.
 * 
 * Configuration File Requirements
 * <ul>
 * <li><b>basedir</b> -- base directory for the files </li>
 * <li><b>sourcedir</b> -- raw sql/xml data file directory (default: source)</li>
 * <li><b>binarydir</b> -- generated binary file directory (default: binary)</li>
 * <li><b>tempdir</b> -- directory to store temporary files (default: tmp)</li>
 * <li><b>type</b> -- type of wiki to read (enwiki or enwiktionary)</li>
 * <li><b>date</b> -- date of wiki dump</li>
 * <li><b>graph</b> -- graph source information</li>
 * </ul>
 * 
 * The output of this program is a <code>-ex.titlewordmap</code> file placed in the temp directory, holding alternating serialized title strings and vertex numbers.  It will be used as an input file for the {@link CreateMappings} program.
 * 
 * @author weale
 *
 */
public class CreateExactTitleWordMapping {

  /* Name of the Valid ID file generated by CreateValidIDFile (.vid) */
  private static String vidFileName;

  /* Name of the input file generated by CreateRedirectFiles (.rdr) */
  private static String rdrFileName;
  
  /* Wiki page.sql file name */
  private static String sqlFileName;
  
  /* temporary title pair file name */
  private static String titleIDFileName;
  
  /* Wiki page ID to vertex translation, deserialized from the .vid file */
  private static IDVertexTranslation vid;

  /* Redirect page ID to valid page ID lookup, deserialized from the .rdr file */
  private static IDIDRedirect rdr;
    
  /**
   * Creates (term, vertex) pairs from the titles of a wiki data source.
   * <p>
   * Loads the serialized {@link IDVertexTranslation} (.vid) and {@link IDIDRedirect} (.rdr)
   * objects, scans the page.sql dump and, for every namespace-0 page that is either a
   * valid page or a known redirect, writes the lowercased title followed by its vertex
   * number to the -ex.titlewordmap file in the temp directory.
   * 
   * @param args optional single argument: path of the configuration file; a built-in
   *             default path is used when it is not given
   * @throws IOException if the page.sql file cannot be opened, contains no
   *                     INSERT INTO statement, or the output file cannot be written
   */
  public static void main(String[] args) throws IOException {
    
    if(args.length == 1)
    {
      Configuration.parseConfigurationFile(args[0]);
    }
    else
    {
      Configuration.parseConfigurationFile("/scratch/weale/data/config/enwiktionary/CreateTitleWordMapping.xml");
    }
    
    System.out.println("Initializing Valid ID List.");
    try
    {
      vidFileName = Configuration.baseDir + "/" + 
                    Configuration.binaryDir + "/" +
                    Configuration.type + "/" +
                    Configuration.date + "/" +
                    Configuration.type + "-" +
                    Configuration.date + "-" +
                    Configuration.graph + ".vid";
      
      vid = (IDVertexTranslation) readSerializedObject(vidFileName);
    }
    catch(Exception e)
    {
      System.err.println("Problem reading from file: " + vidFileName);
      e.printStackTrace();
      System.exit(1);
    }
    
    System.out.println("Initializing Redirect List.");
    try
    {
      rdrFileName = Configuration.baseDir + "/" +
                    Configuration.binaryDir + "/" +
                    Configuration.type+ "/" +
                    Configuration.date + "/" +
                    Configuration.type + "-"+
                    Configuration.date + "-" +
                    Configuration.graph + ".rdr";
      
      rdr = (IDIDRedirect) readSerializedObject(rdrFileName);
    }
    catch(Exception e)
    {
      System.err.println("Problem reading from file: " + rdrFileName);
      e.printStackTrace();
      System.exit(1);
    }
    
    sqlFileName = Configuration.baseDir + "/" + 
                  Configuration.sourceDir + "/" +
                  Configuration.type+ "/" +
                  Configuration.date + "/" +
                  Configuration.type + "-" +
                  Configuration.date + "-" +
                  "page.sql";
    
    titleIDFileName = Configuration.baseDir + "/" +
                      Configuration.tempDir + "/" +
                      Configuration.type + "-"+
                      Configuration.date + "-" +
                      Configuration.graph + 
                      "-ex.titlewordmap";
                  
    System.out.println("Opening page.sql File");
    // Decode explicitly as UTF-8 instead of the platform default charset, so titles
    // are read the same way on every machine (assumes the dump is UTF-8 encoded).
    Scanner in = new Scanner(new FileInputStream(sqlFileName), "UTF-8");
    try
    {
      System.out.println("Opening .titlewordmap File for Writing");
      ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(titleIDFileName));
      try
      {
        System.out.println("Writing Page Titles and Vertex IDs to File");
        writeTitleMappings(in, out);
      }
      finally
      {
        // Always flush and close, so a failure mid-run does not lose buffered pairs
        out.close();
      }
    }
    finally
    {
      in.close();
    }
  }//end: main()
  
  /**
   * Reads a single serialized object from the given file.
   * 
   * @param fileName path of the file to read
   * @return the first object stored in the file
   * @throws IOException if the file cannot be opened or read
   * @throws ClassNotFoundException if the class of the stored object cannot be found
   */
  private static Object readSerializedObject(String fileName)
      throws IOException, ClassNotFoundException
  {
    // Close the underlying file stream even when the ObjectInputStream
    // constructor itself fails (e.g. on a corrupt stream header).
    InputStream fileIn = new FileInputStream(fileName);
    try
    {
      ObjectInputStream objIn = new ObjectInputStream(fileIn);
      return objIn.readObject();
    }
    finally
    {
      fileIn.close();
    }
  }//end: readSerializedObject()
  
  /**
   * Skips the dump header, then converts every following line into (title, vertex)
   * pairs until a blank line or the end of the input is reached.
   * 
   * @param in scanner positioned at the start of the page.sql dump
   * @param out stream receiving the alternating title / vertex records
   * @throws IOException if no INSERT INTO statement is found or writing fails
   */
  private static void writeTitleMappings(Scanner in, ObjectOutputStream out) throws IOException
  {
    // Strip beginning lines from file
    String str = null;
    while(str == null && in.hasNextLine())
    {
      String candidate = in.nextLine();
      if(candidate.indexOf("INSERT INTO") != -1)
      {
        str = candidate;
      }
    }
    
    if(str == null)
    {
      throw new IOException("No INSERT INTO statement found in: " + sqlFileName);
    }
    
    while(str != null && !str.trim().equals(""))
    {
      writeMappingsForLine(str, out);
      
      // Scanner.nextLine() throws at end of input rather than returning null,
      // so end-of-file has to be detected explicitly.
      str = in.hasNextLine() ? in.nextLine() : null;
    }//end: while(str)
  }//end: writeTitleMappings()
  
  /**
   * Parses the page records found on one line of the dump and writes a
   * (title, vertex) pair for each qualifying namespace-0 page.
   * 
   * @param line one line of the page.sql dump
   * @param out stream receiving the alternating title / vertex records
   * @throws IOException if writing to the output stream fails
   */
  private static void writeMappingsForLine(String line, ObjectOutputStream out) throws IOException
  {
    // Keep the text after the first "(" and drop the last three characters.
    // The record separator below consumes the final digit of every record it
    // splits on, so the last record is trimmed by one extra character as well
    // (presumably its final digit plus the closing ");").
    int start = line.indexOf("(") + 1;
    int end = line.length() - 3;
    if(end < start)
    {
      // Too short to contain a record
      return;
    }
    
    // Split the String into the individual page information
    String [] arr = line.substring(start, end).split("\\d\\),\\(");
    for(int i = 0; i < arr.length; i++)
    {
      String [] info = arr[i].split(",");
        
      // A record has at least 11 fields; more appear when the title itself contains commas
      if(info.length < 11)
      {
        continue;
      }
      
      // Extract page, namespace and redirect information
      String page = info[0];
      String namespace = info[1];
      String redirect = info[info.length-6];
          
      // Extract title information, re-joining the pieces of a title split on its commas
      StringBuilder rawTitle = new StringBuilder(info[2]);
      for(int j = 3; j < info.length-8; j++)
      {
        rawTitle.append(',').append(info[j]);
      }
      
      // Remove quotes, replace underscore characters.  Locale.ROOT keeps the
      // lowercasing independent of the machine's default locale.
      // NOTE(review): SQL escape sequences inside the title (e.g. \') are kept as-is;
      // confirm whether downstream consumers expect them unescaped.
      String title = rawTitle.substring(1, rawTitle.length()-1)
                             .replace('_', ' ')
                             .toLowerCase(Locale.ROOT);
      
      int pageID = Integer.parseInt(page);
      boolean mainNamespace = namespace.equals("0");
          
      // Add the ID if it's in the needed namespace and not a redirect
      if(mainNamespace && redirect.equals("0") && vid.isValidWikiID(pageID))
      {
        writeMapping(out, title, pageID, ":");
      }//end: if()
          
      // Add the ID after redirect
      else if(mainNamespace && redirect.equals("1") && rdr.isRedirectID(pageID))
      {
        writeMapping(out, title, rdr.redirectIDToValidID(pageID), ">");
      }//end: else if()
    }//end: for(i)
  }//end: writeMappingsForLine()
  
  /**
   * Writes one title followed by the vertex number of the given wiki page ID.
   * A negative vertex number is reported on standard error but still written.
   * 
   * @param out stream receiving the record
   * @param title lowercased page title
   * @param wikiID wiki page ID whose vertex is looked up
   * @param separator text placed between the ID and the title in the warning message
   * @throws IOException if writing to the output stream fails
   */
  private static void writeMapping(ObjectOutputStream out, String title, int wikiID, String separator)
      throws IOException
  {
    // Write title to the file
    out.writeObject(title);
    
    // Write the vertex number to the file
    int vertex = vid.getVertex(wikiID);
    if(vertex < 0)
    {
      System.err.println(wikiID + separator + title);
    }
    out.writeInt(vertex);
  }//end: writeMapping()
}//end: CreateExactTitleWordMapping
